# Databricks notebook source
pip install jieba stylecloud snownlp

# COMMAND ----------

# Load the comment export from mounted storage. The first row is used as the
# header; no schema or inferSchema option is passed, so the columns are read
# with Spark's default CSV typing.
comments_path = "/mnt/storage/movies/avatar_water_comments.csv"
df = spark.read.option("header", True).csv(comments_path)

# Show the resulting schema and a preview of the rows.
df.printSchema()
display(df)

# COMMAND ----------

# Show only the rating column, keeping rows whose rating is not the empty
# string (rows where the comparison is not true, e.g. NULL ratings, are
# dropped as well).
rating_col = df["评分"]
rated_only = df.select(rating_col).filter(rating_col != "")
display(rated_only)

# COMMAND ----------

import jieba
from pyspark.sql import functions as F
from pyspark.sql.types import ArrayType, StringType

# Custom function used for Chinese word segmentation.
def seg_sentence(content):
    """Split a piece of text into a list of tokens using jieba.

    Args:
        content: The text to segment. May be ``None`` or empty, which happens
            when the source column has no value for a row.

    Returns:
        A list of token strings; an empty list when ``content`` is ``None``
        or empty.
    """
    # A missing comment must not crash the UDF (jieba cannot tokenize None),
    # and an empty string yields no tokens anyway.
    if not content:
        return []
    return list(jieba.cut(content))

# Register the tokenizer as a Spark UDF returning array<string>.
# Note: this rebinds the name `seg_sentence` from the plain function to the UDF.
seg_sentence = F.udf(seg_sentence, ArrayType(StringType()))

# Tokenize every comment, flatten the per-row token lists, collect all tokens
# to the driver and join them into one space-separated string.
tokenized = df.select(seg_sentence('评论内容').alias("words"))
all_tokens = tokenized.rdd.flatMap(lambda row: row.words).collect()
words = " ".join(all_tokens)

# COMMAND ----------

from stylecloud import gen_stylecloud

# Generate the word-cloud image from the collected tokens and write it to
# /dbfs/FileStore/AvatarCloud.png.
# NOTE(review): font_path is relative ("../../SimHei.ttf") and therefore
# depends on the working directory — confirm the font is reachable from there.
cloud_stopwords = {'你', '我', '也', '是', '的', '了', '都', '在', '但', '和', '有'}

gen_stylecloud(
    text=words,
    size=(800, 600),
    icon_name='fas fa-dragon',
    max_font_size=100,
    max_words=2000,
    output_name='/dbfs/FileStore/AvatarCloud.png',
    font_path="../../SimHei.ttf",
    collocations=True,
    custom_stopwords=cloud_stopwords,
)

# COMMAND ----------

# MAGIC %md
# MAGIC 
# MAGIC ![](files/AvatarCloud.png)

# COMMAND ----------

from snownlp import SnowNLP
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

def sentiments(content):
    """Score the sentiment of a piece of text with SnowNLP.

    Args:
        content: The text to score. May be ``None`` when the source column
            has no value for a row.

    Returns:
        The value of ``SnowNLP(...).sentiments`` for the text, or ``None``
        when ``content`` is ``None``.
    """
    # str(None) would be the literal text "None", which would then receive a
    # meaningless score; propagate the missing value instead.
    if content is None:
        return None
    return SnowNLP(str(content)).sentiments

# Register the scorer as a Spark UDF returning a double.
# Note: this rebinds the name `sentiments` from the plain function to the UDF.
sentiments = F.udf(sentiments, DoubleType())

# Compute and show the sentiment score of every comment.
comment_scores = df.select(sentiments('评论内容'))
display(comment_scores)

# COMMAND ----------

# Expose the DataFrame to Spark SQL under the name `comments`.
df.createOrReplaceTempView('comments')

# CASE expression that maps a like count to a labelled bucket. It is needed
# both in the SELECT list and in GROUP BY, so it is built once and reused.
like_bucket = (
    "case when `赞同数` < 100 then '小于100' "
    "when `赞同数` between 100 and 199 then '100至200' "
    "when `赞同数` between 200 and 499 then '200至500' "
    "when `赞同数` between 500 and 999 then '500至1000' "
    "when `赞同数` >= 1000 then '超过1000' end"
)

# Count the comments falling into each like-count bucket.
histogram_sql = (
    f"select {like_bucket} as `点赞数`,count(`赞同数`) "
    f"from comments group by {like_bucket}"
)
display(spark.sql(histogram_sql))

# COMMAND ----------

# Expose the DataFrame to Spark SQL under the name `comments` (repeated here
# so the query below does not rely on an earlier registration).
df.createOrReplaceTempView('comments')

# List the 20 most-liked comments among those with more than 1000 likes.
#
# Label trick: 100 + row_number() gives 101, 102, ...; prefixed with 'TOP'
# this is 'TOP101', and replacing 'TOP1' by 'TOP' leaves 'TOP01' — i.e. a
# zero-padded two-digit rank label (valid for ranks 1-99).
#
# The outer ORDER BY is required: LIMIT without ORDER BY does not guarantee
# which rows are returned, so the "top 20" could otherwise be an arbitrary 20.
# Rows with equal like counts may still appear in either order.
display(spark.sql(
    "select replace('TOP' || (100 + row_number() over(order by cast(`赞同数` as int) desc)), 'TOP1', 'TOP') as `热评`, "
    "cast(`赞同数` as int) "
    "from comments where `赞同数` > 1000 "
    "order by cast(`赞同数` as int) desc limit 20"
))
